#Fig 1 - Panels 1 and 2

#This Script produces both panels of Figure 1

rm(list=ls());gc();gc();gc();gc();gc();gc();gc();gc()
library(Hmisc)
library(tidyverse)
library(dplyr)
library(hrbrthemes)
library(scales)
library(RColorBrewer)
library(viridis)
library(ggtext)

here::i_am("Scripts/Fig1Script.R")

library(here)
# Panel 1 ####

##First let's get our baseline:####
df<-readRDS(here("Data", "acrossanalysisset.rds"))


#Check to see that all donors have at least 2 donations
numdons<-df%>%
  group_by(finstate3dc)%>%
  summarise(numdons=n())

summary(numdons)

#Find out what the state breakdowns are for fundraisers and non-fundraisers
df %>%
  group_by(finstate3dc)%>%
  dplyr::mutate(edon=ifelse(sum(fundraiser)>0,1,0))%>%
  select(finstate3dc, edon,cSTATE)%>%
  unique()%>%
  group_by(edon,cSTATE)%>%
  dplyr::summarize(count=n())


#These are the targets we hit to create a sample for

#Create 1200 samples with these ratios, and then distribute them 
#across the dataset
numsims<-1200



#Get the donor ids and their wsd
kypart<-df[df$cSTATE=="KY",]
kypartdons<-kypart%>%
  group_by(finstate3dc)%>%
  dplyr::mutate(edon=ifelse(sum(fundraiser)>0,1,0))%>%
  dplyr::mutate(wsd=sqrt(wtd.var(recipient.cfscore,weights=rcAMOUNT)))%>%
  ungroup%>%
  select(finstate3dc, wsd,edon)%>%
  unique()
#saveRDS(kypartdons, "kypartdons.rds") 

mipart<-df[df$cSTATE=="MI",]
mipartdons<-mipart%>%
  group_by(finstate3dc)%>%
  dplyr::mutate(edon=ifelse(sum(fundraiser)>0,1,0))%>% #Are they an event donor?
  dplyr::mutate(wsd=sqrt(wtd.var(recipient.cfscore,weights=rcAMOUNT)))%>% #What is the donation-amount-weighted standard deviation of the ideology scores for targeted candidates
  ungroup%>%
  select(finstate3dc, wsd,edon)%>%
  unique()
#saveRDS(mipartdons, "mipartdons.rds") 

ohpart<-df[df$cSTATE=="OH",]
ohpartdons<-ohpart%>%
  group_by(finstate3dc)%>%
  dplyr::mutate(edon=ifelse(sum(fundraiser)>0,1,0))%>%
  dplyr::mutate(wsd=sqrt(wtd.var(recipient.cfscore,weights=rcAMOUNT)))%>%
  ungroup%>%
  select(finstate3dc, wsd,edon)%>%
  unique()
#saveRDS(ohpartdons, "ohpartdons.rds") 


allpartdons<-bind_rows(kypartdons, mipartdons,ohpartdons)
#saveRDS(allpartdons, "allpartdons.rds")

#set seed and sample
set.seed(82)

kysamps<-as.data.frame(replicate(numsims,sample(kypartdons$edon,length(kypartdons$edon),replace = FALSE)))
#saveRDS(kysamps, "kysampsacross.rds")
misamps<-as.data.frame(replicate(numsims,sample(mipartdons$edon,length(mipartdons$edon),replace = FALSE)))
#saveRDS(misamps, "misampsacross.rds")
ohsamps<-as.data.frame(replicate(numsims,sample(ohpartdons$edon,length(ohpartdons$edon),replace = FALSE)))
#saveRDS(ohsamps, "ohsampsacross.rds")



allsamps<-rbind(kysamps, misamps, ohsamps)
#saveRDS(allsamps, "allsampsacross.rds")


#Now attach to the important parts of the main dataset

dfan<-cbind(allpartdons, allsamps)

#saveRDS(dfan, "sampanalysisacross.rds")

#in each of these simulations, we have new "event donors" and "non-event donors"

#Let's focus on how to do this once with our actual data, before looping through:


medians<-dfan%>%
  group_by(edon)%>%
  dplyr::summarize(medians=median(wsd))

sampdist<-medians[2,2]-medians[1,2]

sampdist
# 0.06868137 This is the difference between the median donation-amount-weighted standard deviation of 
#target candidate ideology of event donors and that of non-event donors


#Let's try to do this for each of the simulations

#Going to want an empty place for results that is 1200 rows, 3 columns (simnum, wmeventdonor, wmnoneventdonors)
set.seed(82)

numsims<-1200

medresults <- vector(mode='list', length=numsims)




for (i in 1:numsims){
  print(i)
  dfan1<-dfan[,c(1,2, 3+i)]
  names(dfan1)[3]<-"edon" #THis is the simulated assignment to either event or non-event donor
  
medresults[[i]]<-dfan1 %>%
    group_by(edon)%>%
    dplyr::summarize(medians=median(wsd))

}
#saveRDS(medresults, "acrosssampsimresults1200.rds")


distances<-unlist(lapply(medresults,function(x){
  x[2,2]-x[1,2]
}))

#distribution of differences between event donors sd and non-event donors sd
hist(distances, breaks=100, col="red")


#Paper quality graphs - I want the distribution of distances first, with our distance 


df<-data.frame(label="(SD of Event Donors - SD of Non-Event Donors) in Sample: .0688",
               x = Inf, y = Inf, hjust = 0, vjust = 1)

breaks <- c(-.005,-.004,-.003,-.002,-.001,0,.001,.002,.003,.07)

p<-ggplot()+aes(distances) + 
  geom_histogram(aes(y=after_stat(density)),      # Histogram with density instead of count on y-axis
                 bins=300,
                 colour="black", fill="white") +
  geom_density(alpha=.2, fill="lightgrey")+
  labs(x="Difference between Median SD of Event Donors \n and Median SD of Non-Event Donors", y="Frequency")+
  theme_ipsum() +
  #geom_textbox(aes(label = 'Median SD of Event Donors - Median SD of Non-Event Donors in Sample: .0688', x = Inf, y = Inf),
  #          hjust = 1, vjust = 1)+
  scale_x_continuous(breaks = round(seq(min(distances), .08, by = 0.01),2)) +
  theme(
    axis.text.x=element_text(size=22),
    axis.text.y=element_text(size=22),
    axis.title.y = element_text(size=22),
    axis.title.x = element_text(size=22),
    legend.text=element_text(size=22),
    legend.title=element_text(size=22),
    strip.text.x = element_text(size = 22),
    plot.margin = margin(t=4,r = 0,  # Right margin
                         b = 0,  # Bottom margin
                         l = 0))+
  geom_vline(xintercept = .0687, linetype="longdash", linewidth = 3) + #LINE AT OUR SAMPLE FINDING FROM LINE 120 above
  geom_text(mapping = aes(x = .05, y = 450, label = "In-sample Value \n .069"),
            inherit.aes = FALSE,size=10)+
  geom_segment(aes(x = .0575, y = 400, xend = .065, yend = 400),
               arrow = arrow(length = unit(0.5, "cm")))

p


ggsave(here("Results","Fig1Panel1.png"), width = 12, height = 5)

#Get EPS version for the Manuscript:
ggsave(here("Results","Fig1Panel1.eps"), device="eps", width = 12, height = 5)


# Panel 2 - Within Analysis ####

## Load Data #####

df<-readRDS(here("Data","withinanalysisset.rds"))


df<-df %>%
  group_by(finstate3dc) %>%
  mutate(rid=row_number())
df$dc_rid<-paste(df$finstate3dc,df$rid,sep="_")

df2<-df[,which(colnames(df)%in%c("rcAMOUNT","finstate3dc","recipient.cfscore","fundraiser", "rid", "dc_rid"))]

#Get the weighted means of fundraiser, non-fundraisers, 

baselinemeans<-df2 %>%
  group_by(finstate3dc,fundraiser)%>%
  dplyr::summarize(wmean=weighted.mean(recipient.cfscore,w=rcAMOUNT))%>%
  ungroup()

#weighted sds of non-events
df3<-df2[df2$fundraiser==0,]
baselinesds<-df3 %>%
  group_by(finstate3dc)%>%
  dplyr::summarize(wsd=sqrt(wtd.var(recipient.cfscore,weights=rcAMOUNT)))%>%
  ungroup()

#join these
blmeansnonevent<-baselinemeans%>%
  filter(fundraiser==0)

blmeansevent<-baselinemeans%>%
  filter(fundraiser==1)

names(blmeansnonevent)[3]<-"newmean"

names(blmeansevent)[3]<-"ewmean"

blmeansnonevent$fundraiser<-NULL
blmeansevent$fundraiser<-NULL

baselineall<-merge(baselinesds, blmeansevent, by="finstate3dc")
baselineall<-merge(baselineall, blmeansnonevent, by="finstate3dc")

baselineall$absdist<-abs(baselineall$ewmean-baselineall$newmean)
baselineall$adjud<-ifelse(baselineall$absdist>baselineall$wsd, 1, 0)
baselineall$adjud2<-ifelse(baselineall$absdist>(2*baselineall$wsd), 1, 0)

#saveRDS(baselineall, "withindonoranalysisbaselines.rds")

#baselineall<-readRDS("withindonoranalysisbaselines.rds")

#There may be some differences in these numbers depending on floating point numbers, but
# these differences are all within small percentages, and do not affect our inference
#comparing the distribution in our sample to that of the simulations.

#One SD
mean(baselineall$adjud) #46.5%
#2SD
mean(baselineall$adjud2) #29.0%



#Now let's do the simulations ####
#Load data with simulations attached
#(Simulations created on HPC using code in Script "withindonorsims.R")

#First set
df1<-readRDS(here("Data", "withindonorsimdf1.rds"))
df2<-readRDS(here("Data", "withindonorsimdf201.rds"))
df3<-readRDS(here("Data", "withindonorsimdf401.rds"))
df4<-readRDS(here("Data", "withindonorsimdf601.rds"))
df5<-readRDS(here("Data", "withindonorsimdf801.rds"))
df6<-readRDS(here("Data", "withindonorsimdf1001.rds"))


#merge them
#put all data frames into list
df_list <- list(df1, df2, df3, df4, df5, df6)

#merge all data frames in list
dfall<-df_list %>% reduce(full_join, by=c("dc_rid", "fundraiser", "finstate3dc", "rcAMOUNT", "recipient.cfscore", "rid"))

#the indexing is a bit off for three of these so for those three simulations 
#(1st, 3rd, and 4th) the second one is blank and there is a 
#leftover simulation near the end - fix those:

dfall$fundraiser_s2<-dfall$V207.x
dfall$fundraiser_s2[is.na(dfall$fundraiser_s2)]<-0

dfall$fundraiser_s402<-dfall$V207.y
dfall$fundraiser_s402[is.na(dfall$fundraiser_s402)]<-0

dfall$fundraiser_s602<-dfall$V207
dfall$fundraiser_s602[is.na(dfall$fundraiser_s602)]<-0

dfall$V207<-NULL
dfall$V207.x<-NULL
dfall$V207.y<-NULL


#Check that the numbers look right
summary(colSums(dfall[, 7:1206], na.rm = TRUE))

#Loop
#Create a place for it - want a results df that has:
#simulation number (1-1200)
#donor id (finstate3dc; 44443)
#wmean(events)
#wmean(non-events)
#wsd(non-events)
#Thus, has to be 5 columns, first iteration is 44443 rows large




#First iteration
newdf<-dfall[,c(1:6, 6+1)]

firstnew<-newdf%>%
  group_by_at(c(3, 7))%>%
  dplyr::summarize(wmean=weighted.mean(recipient.cfscore,w=rcAMOUNT))%>%
  ungroup()

othernew<-newdf%>%
  group_by(finstate3dc)%>%
  dplyr::summarize(wsd=sqrt(wtd.var(recipient.cfscore,weights=rcAMOUNT)))%>%
  ungroup()

#join these
blmeansnonevent<-firstnew%>%
  filter(across(2, ~ . ==0))

blmeansevent<-firstnew%>%
  filter(across(2, ~ . ==1))

names(blmeansnonevent)[3]<-"newmean"

names(blmeansevent)[3]<-"ewmean"

blmeansnonevent[,2]<-NULL
blmeansevent[,2]<-NULL

baselineall<-merge(othernew, blmeansevent, by="finstate3dc")
baselineall<-merge(baselineall, blmeansnonevent, by="finstate3dc")

baselineall$simulation<-1

matresults<-baselineall
matresultlist<-vector(mode='list', length=1200)

matresultlist[[1]]<-matresults

#Takes a while (~an hour) to run, but can run on regular computer 
for(i in 2:1200){
  print(i)
  newdf<-dfall[,c(1:6, 6+i)]
  
  firstnew<-newdf%>%
    group_by_at(c(3, 7))%>%
    dplyr::summarize(wmean=weighted.mean(recipient.cfscore,w=rcAMOUNT))%>%
    ungroup()
  
  othernew<-newdf%>%
    group_by(finstate3dc)%>%
    dplyr::summarize(wsd=sqrt(wtd.var(recipient.cfscore,weights=rcAMOUNT)))%>%
    ungroup()
  
  #join these
  blmeansnonevent<-firstnew%>%
    filter(across(2, ~ . ==0))
  
  blmeansevent<-firstnew%>%
    filter(across(2, ~ . ==1))
  
  names(blmeansnonevent)[3]<-"newmean"
  
  names(blmeansevent)[3]<-"ewmean"
  
  blmeansnonevent[,2]<-NULL
  blmeansevent[,2]<-NULL
  
  baselineall<-merge(othernew, blmeansevent, by="finstate3dc")
  baselineall<-merge(baselineall, blmeansnonevent, by="finstate3dc")
  
  baselineall$simulation<-i
  
  matresultlist[[i]]<-baselineall
  
}

#saveRDS(matresultlist, "simulationmeansandsdswc.rds")

#matresults2<-readRDS("simulationmeansandsdswc.rds")
matresults2<-matresultlist

matresults<-as.data.frame(matrix(nrow=1200, ncol=2))

colnames(matresults)<-c("meanadjud", "meanadjud2")

for(i in 1:1200){
  
  absdist<-abs(matresults2[[i]]$ewmean-matresults2[[i]]$newmean)
  matresults$meanadjud[i]<-mean(ifelse(absdist>matresults2[[i]]$wsd, 1, 0))
  matresults$meanadjud2[i]<-mean(ifelse(absdist>(2*matresults2[[i]]$wsd), 1, 0))
  
}


#saveRDS(matresults, "simulationmeansandsds.rds")


#Paper quality graphs - I want the distribution of distances first, with our distance 
# put in the legend maybe?

#matresults<-readRDS("simulationmeansandsds.rds")

#multiply by 100 for percent
matresults$meanadjud2100<-matresults$meanadjud2*100

breaks<-c(.3,.305,.31,.315,.32,.42,.45)
breaks2<-c(.044,.046,.048,.049,.05,.051,.052,.053,.054,.25,.28)


#Two standard Deviations
p2<-ggplot(matresults)+aes(meanadjud2100) + 
  geom_histogram(aes(y=after_stat(density)),      # Histogram with density instead of count on y-axis
                 binwidth=.0005,
                 colour="black", fill="white") +
  geom_density(alpha=.2, fill="lightgrey")+
  labs(x="Percent of Donors Where Distance between Means \n is More than Two Standard Deviations", y="Frequency")+
  theme_ipsum() +
  # geom_textbox(aes(label = 'Percent of Donors in Sample Who Qualify: 27.4%', x = Inf, y = Inf),
  #             hjust = 1, vjust = 1)+
  scale_x_continuous(breaks = round(seq(min(matresults$meanadjud2100)-3.5, 32, by = 3),1)) +
  theme(
    axis.text.x=element_text(size=22),
    axis.text.y=element_text(size=22),
    axis.title.y = element_text(size=22),
    axis.title.x = element_text(size=22),
    legend.text=element_text(size=22),
    legend.title=element_text(size=22),
    strip.text.x = element_text(size = 22),
    plot.margin = margin(t=4,r = 0,  # Right margin
                         b = 0,  # Bottom margin
                         l = 0))+
  geom_vline(xintercept = 29, linetype="longdash", linewidth = 3) + 
  geom_text(mapping = aes(x = 24.5, y = 15, label = "In-sample Value \n 29.0%"),
            inherit.aes = FALSE, size=10)+
  geom_segment(aes(x = 27, y = 12.75, xend = 28.5, yend = 12.75),
               arrow = arrow(length = unit(0.5, "cm")))

p2


ggsave(here("Results", "Fig1Panel2.png"), width = 12, height = 5)

#Get EPS version for the Manuscript:
ggsave(here("Results", "Fig1Panel2.eps"),device="eps", width = 12, height = 5)

